{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n",
      "Anthony Abercrombie 2017-01-28 22:09:39 \n",
      "\n",
      "CPython 3.5.2\n",
      "IPython 5.1.0\n",
      "\n",
      "numpy 1.11.2\n",
      "pandas 0.19.2+0.g825876c.dirty\n",
      "matplotlib 1.5.1\n",
      "Git hash: 5ab329262c0bf29337f14c3e151cd0f2fdb1c34e\n"
     ]
    }
   ],
   "source": [
    "# Environment at time of execution\n",
    "%load_ext watermark\n",
    "%pylab inline\n",
    "%watermark -a \"Anthony Abercrombie\" -d -t -v -p numpy,pandas,matplotlib -g"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import dotenv\n",
    "import subprocess\n",
    "import glob\n",
    "from tqdm import tqdm\n",
    "\n",
    "#File path to get to the project root\n",
    "PROJ_ROOT = os.path.join(os.path.pardir, os.pardir)\n",
    "# add local python functions\n",
    "sys.path.append(os.path.join(PROJ_ROOT, \"src\"))\n",
    "\n",
    "#Load AWS keys as environment variables\n",
    "dotenv_path = os.path.join(PROJ_ROOT, '.env')\n",
    "dotenv.load_dotenv(dotenv_path)\n",
    "\n",
    "AWS_ACCESS_KEY = os.environ.get(\"AWS_ACCESS_KEY\")\n",
    "AWS_SECRET_ACCESS_KEY = os.environ.get(\"AWS_SECRET_ACCESS_KEY\")\n",
    "SPARK_HOME = os.environ.get(\"SPARK_HOME\")\n",
    "ec2_keypair = os.environ.get(\"ec2_keypair\")\n",
    "ec2_keypair_pem = os.environ.get(\"ec2_keypair_pem\")\n",
    "\n",
    "\n",
    "from __future__ import print_function\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Load the \"autoreload\" extension\n",
    "%load_ext autoreload\n",
    "# always reload modules marked with \"%aimport\"\n",
    "%autoreload 1\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### where is spark-ec2?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'~/spark-1.6.3-bin-hadoop2.6'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SPARK_HOME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def spinup_spark_ec2(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name):\n",
    "    bash_command = '{}/ec2/spark-ec2 -k {} -i {}> -s {} launch {}'.format(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name)\n",
    "    return bash_command"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 1, 'spark_ec2_cluster')\n",
    "x = spinup_spark_ec2(*args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'~/spark-1.6.3-bin-hadoop2.6/ec2/spark-ec2 -k aws-key-fast-ai -i ~/.ssh/aws-key-fast-ai.pem> -s 1 launch spark_ec2_cluster'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "'{}/bin/spark-ec2 -k {}<keypair> -i {}<key-file> -s {}<num-slaves> launch {}<cluster-name>'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Numerical DataFlow with Spark and Tensorflow\n",
    "#### checkout tensorframes from databricks\n",
    "\n",
    "```python\n",
    "df = sqlContext.createDataFrame()\n",
    "x= tf.placeholder(tf.int32, name='x')\n",
    "y= tf.placeholder(tf.int32, name='y')\n",
    "output = tf.add(x, 3*y, name='z')\n",
    "\n",
    "\n",
    "session = tf.session()\n",
    "output_value = session.run(output, {x:3, y:5})\n",
    "\n",
    "output_df = tfs.map_rows(output, df)\n",
    "output_df.collect()\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Connect to master node"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def connect_master_node(SPARK_HOME, keypair, keyfile, region,cluster_name):\n",
    "    bash_cmd = '{}/ec2/spark-ec2 -k {} -i {} --region={} login {}'.format(SPARK_HOME, keypair, keyfile, region,cluster_name)\n",
    "    return bash_cmd\n",
    "args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 'us-west-2b', 'spark_ec2_cluster')\n",
    "y = connect_master_node(*args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'~/spark-1.6.3-bin-hadoop2.6/ec2/spark-ec2 -k aws-key-fast-ai -i ~/.ssh/aws-key-fast-ai.pem --region=us-west-2b login spark_ec2_cluster'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### AttributeError: 'NoneType' object has no attribute 'get_all_zones'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
